from unstructured.chunking.basic import chunk_elements
from unstructured.partition.pdf import partition_pdf

# Example Tesseract CLI invocation using the "chi_sim" language code:
#   tesseract D:/document/mongoDB/mongodb_selectAndUpate.png output -l chi_sim

# Extract PDF elements.
# Path of the PDF to process (the Chinese edition). An English edition sits in
# the same directory as "云冈石窟-en.pdf"; the script previously assigned that
# path first and then immediately overwrote it, so only this value was ever used.
pdf_path: str = "D:/ideaSpace/rag-in-action-master/90-文档-Data/山西文旅/云冈石窟-ch.pdf"

# Partition first, then chunk.
# Parse the PDF at `pdf_path` into a list of document elements.
# strategy="auto" leaves the choice of partitioning strategy to the library;
# languages=["chi_sim"] passes the Simplified Chinese language code
# (presumably consumed by OCR when it is needed — confirm against the docs).
elements = partition_pdf(
    filename=pdf_path,
    strategy="auto",
    languages=["chi_sim"],
)

# Basic chunking: split purely by size.
# `chunk_elements` is imported at the top of the file together with the other
# `unstructured` imports.
chunks = chunk_elements(
    elements,
    max_characters=1000,  # maximum number of characters per chunk
    new_after_n_chars=800,  # a new chunk may be started once 800 characters are reached
    # Number of characters of overlap carried between chunks.
    # NOTE(review): per the library docs this applies by default only to chunks
    # produced by splitting an oversized element; `overlap_all=True` would be
    # needed for overlap between every pair of chunks — confirm the intent.
    overlap=200,
)

# Inspect the result: show only the first 3 chunks.
preview = chunks[:3]
for i, chunk in enumerate(preview):
    # A single print with sep="\n" emits exactly what three consecutive
    # prints would: the header, the chunk text, then a separator line
    # surrounded by blank lines.
    print(f"Chunk {i+1}:", chunk, "\n" + "="*50 + "\n", sep="\n")
